Bibliotecas utilizadas para o estudo
library(tidyverse)
library(lubridate)
library(corrplot)
library(plotly)
library(readxl)
library(fastDummies)
library(AmesHousing)
library(recipes)
library(caret)
library(urca)
library(car)
library(forecast)
library(rpart)
library(rpart.plot)
library(ranger)
library(randomForest)
Analises descritivas do dataset
glimpse(baba_all)
## Rows: 396
## Columns: 12
## $ date <date> 2014-08-01, 2014-08-02, 2014-08-03, 2014-08-04, 201...
## $ mes <chr> "agosto", "agosto", "agosto", "agosto", "agosto", "a...
## $ weekday <chr> "5", "6", "7", "1", "2", "3", "4", "5", "6", "7", "1...
## $ margem <dbl> 0.4061105, 0.4160218, 0.4319928, 0.4092159, 0.449647...
## $ venda <dbl> 110042.46, 58377.32, 64635.39, 140417.32, 149700.29,...
## $ desconto <dbl> 9190.908, 5713.043, 8621.709, 18312.966, 19942.741, ...
## $ outdesc <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ outmg <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ conj <chr> "train", "train", "train", "train", "train", "train"...
## $ month <date> 2014-08-01, 2014-08-01, 2014-08-01, 2014-08-01, 201...
## $ is_brz_holiday <chr> "Noholiday", "Noholiday", "Noholiday", "Noholiday", ...
## $ brz_season <chr> "Winter", "Winter", "Winter", "Winter", "Winter", "W...
summary(baba_all)
## date mes weekday margem
## Min. :2014-08-01 Length:396 Length:396 Min. :0.2907
## 1st Qu.:2014-11-07 Class :character Class :character 1st Qu.:0.3895
## Median :2015-02-14 Mode :character Mode :character Median :0.4159
## Mean :2015-02-14 Mean :0.4142
## 3rd Qu.:2015-05-24 3rd Qu.:0.4372
## Max. :2015-08-31 Max. :0.5142
##
## venda desconto outdesc outmg
## Min. : 16509 Min. : 32.65 Min. :0.000000 Min. :0.000000
## 1st Qu.: 51103 1st Qu.: 3274.98 1st Qu.:0.000000 1st Qu.:0.000000
## Median : 75280 Median : 5400.32 Median :0.000000 Median :0.000000
## Mean : 80618 Mean : 6735.14 Mean :0.007576 Mean :0.005051
## 3rd Qu.:105553 3rd Qu.: 9142.65 3rd Qu.:0.000000 3rd Qu.:0.000000
## Max. :577017 Max. :48454.41 Max. :1.000000 Max. :1.000000
## NA's :31
## conj month is_brz_holiday brz_season
## Length:396 Min. :2014-08-01 Length:396 Length:396
## Class :character 1st Qu.:2014-11-01 Class :character Class :character
## Mode :character Median :2015-02-01 Mode :character Mode :character
## Mean :2015-01-30
## 3rd Qu.:2015-05-01
## Max. :2015-08-01
##
Analise Exploratória das variáveis
# Vendas por dia #
ggplotly(
baba_all %>%
filter(conj == "train") %>%
ggplot(aes(x = date, y = venda)) +
geom_line(color = "black") +
labs(
x = "Data",
y = "Vendas por dia"
)+
theme_bw()
)
# Vendas por mês #
ggplotly(
baba_all %>%
filter(conj == "train") %>%
group_by(month) %>%
summarise(vendas = sum(venda)) %>%
ggplot(aes(x = month, y = vendas)) +
geom_col(color = "blue", fill = "white") +
labs(
x = "Data",
y = "Vendas por mes"
)+
theme_bw()
)
# vendas por estação do ano #
ggplotly(
baba_all %>%
filter(conj == "train") %>%
group_by(brz_season) %>%
summarise(vendas = sum(venda)) %>%
ggplot(aes(x = brz_season, y = vendas, fill = brz_season)) +
geom_col(color = "blue") +
labs(
x = "Data",
y = "Vendas por estação"
)+
theme_bw()
)
# Proporção vendas feriado #
ggplotly(
baba_all %>%
filter(conj == "train") %>%
group_by(is_brz_holiday) %>%
summarise(vendas = sum(venda)/n()) %>%
ggplot(aes(x = is_brz_holiday, y = vendas)) +
geom_col(color = "blue") +
labs(
x = "Data",
y = "Vendas proporção feriado sim/não"
)+
theme_bw()
)
# Boxplot Vendas por dia da semana #
ggplotly(baba_all %>%
filter(conj == "train") %>%
ggplot( aes(x = weekday, y = venda, fill = weekday))+
geom_boxplot()+
labs(
x = "weekday",
y = "Vendas"
)+
theme_bw()
)
# dispersão venda, desconto
plot(baba_train$venda, baba_train$desconto)

# dispersão venda, margem
plot(baba_train$venda, baba_train$margem)

# dispersão desconto, margem
plot(baba_train$desconto, baba_train$margem)

# Histograma de vendas
hist(baba_train$venda)

# Hist de margem
hist(baba_train$margem)

# Hist de desconto
hist(baba_train$desconto)

# Grafico de correlação das variáveis numéricas
baba_all %>%
filter(conj == "train") %>%
select_if(is.numeric) %>%
cor() %>%
corrplot(type = "upper")

Desenvolvimento do modelo de regressão linear, identificando e retirando outliers através do metodo de Distância de Cook
## Modelo de Regressao linear Multipla
baba_train <- baba_all %>%
filter(conj == "train") %>%
select(-conj, -date, -outdesc, -outmg, -month)
modelo <- lm(formula = venda ~ .,
data = baba_train)
summary(modelo)
##
## Call:
## lm(formula = venda ~ ., data = baba_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60314 -10843 -1809 9185 118606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.031e+05 1.872e+04 10.846 < 2e-16 ***
## mesagosto -5.487e+04 8.083e+03 -6.788 5.06e-11 ***
## mesdezembro -3.461e+04 7.811e+03 -4.431 1.27e-05 ***
## mesfevereiro -1.026e+04 7.485e+03 -1.371 0.171218
## mesjaneiro -9.923e+03 7.452e+03 -1.331 0.183917
## mesjulho -3.638e+04 7.494e+03 -4.854 1.84e-06 ***
## mesjunho -1.741e+04 5.129e+03 -3.394 0.000769 ***
## mesmaio -7.343e+02 4.667e+03 -0.157 0.875065
## mesmar?o -3.263e+03 5.846e+03 -0.558 0.577073
## mesnovembro -1.733e+04 8.345e+03 -2.076 0.038612 *
## mesoutubro -2.498e+04 8.318e+03 -3.003 0.002868 **
## messetembro -5.197e+04 7.487e+03 -6.940 1.98e-11 ***
## weekday2 -5.996e+03 3.531e+03 -1.698 0.090431 .
## weekday3 -8.923e+03 3.530e+03 -2.528 0.011924 *
## weekday4 -1.468e+04 3.544e+03 -4.142 4.35e-05 ***
## weekday5 -1.320e+04 3.526e+03 -3.744 0.000213 ***
## weekday6 -2.475e+04 3.652e+03 -6.776 5.43e-11 ***
## weekday7 -2.212e+04 3.623e+03 -6.105 2.80e-09 ***
## margem -3.820e+05 4.085e+04 -9.352 < 2e-16 ***
## desconto 8.442e+00 2.929e-01 28.824 < 2e-16 ***
## is_brz_holidayNoholiday 8.669e+03 6.611e+03 1.311 0.190608
## brz_seasonSpring -5.163e+02 7.031e+03 -0.073 0.941507
## brz_seasonSummer -4.374e+03 5.886e+03 -0.743 0.457842
## brz_seasonWinter 1.792e+04 5.925e+03 3.024 0.002683 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17960 on 341 degrees of freedom
## Multiple R-squared: 0.835, Adjusted R-squared: 0.8238
## F-statistic: 75.02 on 23 and 341 DF, p-value: < 2.2e-16
# Retirada de outliers identificados pelo metodo Distancia de Cook
cooksd <- cooks.distance(modelo)
baba_train$cooktest <- cooksd
baba_train$outlier <- ifelse(2*mean(cooksd) < baba_train$cooktest, 1, 0) #Identificar valores 2 vezes maior que a média da distancia de cook
baba_train <- baba_train %>%
filter(outlier != 1) %>%
select(-cooktest, -outlier)
## Novo modelo sem outliers
modelo <- lm(formula = venda ~ .,
data = baba_train)
summary(modelo)
##
## Call:
## lm(formula = venda ~ ., data = baba_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27353 -8684 -1958 6992 42235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.610e+05 1.534e+04 10.494 < 2e-16 ***
## mesagosto -3.066e+04 6.650e+03 -4.610 5.76e-06 ***
## mesdezembro -3.457e+04 6.164e+03 -5.609 4.30e-08 ***
## mesfevereiro -1.120e+04 5.807e+03 -1.928 0.054706 .
## mesjaneiro -7.538e+03 5.815e+03 -1.296 0.195792
## mesjulho -3.257e+04 5.847e+03 -5.569 5.30e-08 ***
## mesjunho -1.484e+04 3.967e+03 -3.742 0.000215 ***
## mesmaio -2.168e+03 3.625e+03 -0.598 0.550321
## mesmar?o -1.579e+03 4.539e+03 -0.348 0.728216
## mesnovembro -2.334e+04 6.630e+03 -3.521 0.000490 ***
## mesoutubro -2.625e+04 6.647e+03 -3.950 9.56e-05 ***
## messetembro -3.285e+04 5.958e+03 -5.514 7.07e-08 ***
## weekday2 -7.978e+03 2.748e+03 -2.903 0.003950 **
## weekday3 -1.111e+04 2.794e+03 -3.976 8.59e-05 ***
## weekday4 -1.586e+04 2.772e+03 -5.722 2.36e-08 ***
## weekday5 -1.907e+04 2.782e+03 -6.855 3.50e-11 ***
## weekday6 -3.073e+04 2.890e+03 -10.634 < 2e-16 ***
## weekday7 -2.735e+04 2.849e+03 -9.603 < 2e-16 ***
## margem -2.789e+05 3.419e+04 -8.158 7.18e-15 ***
## desconto 6.749e+00 3.240e-01 20.828 < 2e-16 ***
## is_brz_holidayNoholiday 1.496e+04 5.407e+03 2.767 0.005984 **
## brz_seasonSpring 1.296e+04 5.589e+03 2.319 0.021016 *
## brz_seasonSummer 4.512e+03 4.588e+03 0.984 0.326068
## brz_seasonWinter 1.177e+04 4.605e+03 2.557 0.011016 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13740 on 331 degrees of freedom
## Multiple R-squared: 0.8417, Adjusted R-squared: 0.8307
## F-statistic: 76.51 on 23 and 331 DF, p-value: < 2.2e-16